% Benchmark table ranking models by human-likeness.
% The tabular body is read from assets/benchmark_table_humanlike.tex.
\begin{table}[ht]
	% Caption precedes the table body; \label follows \caption so that
	% references resolve to the table number.
	\caption{Benchmark table of model results for most human-like behaviour. The three metrics ``accuracy difference'', ``observed consistency'' and ``error consistency'' (plotted in Figure~\ref{fig:benchmark_barplots}) each produce a different model ranking. The mean rank of a model across those three metrics is used to rank the models on our benchmark.}
	\label{tab:benchmark_table_humanlike}
	\centering
	\input{assets/benchmark_table_humanlike.tex}
\end{table}

% Benchmark table ranking models by out-of-distribution robustness.
% The tabular body is read from assets/benchmark_table_accuracy.tex.
% Placement is written as [!ht] rather than [h!]: an h-only specifier is
% extended with `t' by LaTeX anyway (with a warning) when the float does not
% fit at the point of definition, so the fallback is stated explicitly.
\begin{table}[!ht]
	\caption{Benchmark table of model results for highest out-of-distribution robustness.}
	\label{tab:benchmark_table_accurate}
	\centering
	\input{assets/benchmark_table_accuracy.tex}
\end{table}


